Exploratory Data Analysis

Do an exploratory data analysis of a matrix of expression values. The data consists of expression values for samples that were treated with DMSO and TSA. The samples were measured using three technologies: bulk, IFC96, IFC800. See the two RDS files counts.RDS and phenodata.RDS

Data Import

x = readRDS("counts.RDS")
anno = readRDS("phenodata.RDS")
head(anno)
##                       Treatment Technology
## bulk.DMSO-Pre-96IFC        DMSO       bulk
## bulk.TSA-Pre-96IFC          TSA       bulk
## bulk.DMSO-Pre-800IFC       DMSO       bulk
## bulk.TSA-Pre-800IFC         TSA       bulk
## bulk.DMSO-Post-800IFC      DMSO       bulk
## bulk.TSA-Post-800IFC        TSA       bulk

Compute and visualize basic statistics

How many replicates are there for each combination of factor levels?

unique_tech<-unique(anno$Technology)
unique_treat<-unique(anno$Treatment)
row<-length(unique_tech)
col<-length(unique_treat)
frequency_table<-matrix(0,nrow = row, ncol = col)
rownames(frequency_table)<-unique_tech
colnames(frequency_table)<-unique_treat

n=length(anno$Treatment)
for (i in 1:n){
  frequency_table[anno$Technology[i],anno$Treatment[i]]=frequency_table[anno$Technology[i],anno$Treatment[i]]+1
}
frequency_table
##        DMSO TSA
## bulk      3   3
## IFC96    20  20
## IFC800   20  20

How many genes have an expression value above 0 in each sample? x>0 converts matrix to a matrix with same dimension but values of entries is 0 or 1 colSums: sum over the columns (sum over each samples). does sum over columns for all columns. it gives a row vector genes_above_zero: a vector that has as many as components as there are for columns

#x>0 converts matrix to a matrix with same dimension but values of entries is 0 or 1
#colSums: sum over the columns (sum over each samples). does sum over for a given column and you do that for all columns. 
#colSums gives a row vector
#genes_above_zero: a vector that has as many as components as there are for columns

genes_above_zero = colSums(x > 0)
head(genes_above_zero)
##   bulk.DMSO-Pre-96IFC    bulk.TSA-Pre-96IFC  bulk.DMSO-Pre-800IFC 
##                 15163                 15261                 15404 
##   bulk.TSA-Pre-800IFC bulk.DMSO-Post-800IFC  bulk.TSA-Post-800IFC 
##                 16182                 15202                 16167

Normalize the data

Scale the columns so that the total sum of all columns are identical

scale= centers and scales. centering: subtract the average from every column every column is 0 centered. didnt want to do this scale:should pass a vector which has same elements as there are columns. x has 86 columns. this vector should have 86 elements colSums(x) gives a vector contains as many cols as there are in cols in x. each column is going to be divided by sum of each columns

# Multiply by a million to better separate 0 from the other values
x_scaled = scale(x, center = FALSE, scale = colSums(x)*1e-6)

head(colSums(x_scaled))
##   bulk.DMSO-Pre-96IFC    bulk.TSA-Pre-96IFC  bulk.DMSO-Pre-800IFC 
##                 1e+06                 1e+06                 1e+06 
##   bulk.TSA-Pre-800IFC bulk.DMSO-Post-800IFC  bulk.TSA-Post-800IFC 
##                 1e+06                 1e+06                 1e+06

Transform the data to log-scale

Use the function log1p to transform the data to log-scale log1p=log(1+x) to prevent crash of log0

x_log = log1p(x_scaled)

Visualize the distribution of the expression values

This is pre-processing step for violin plots and box plots

# prepare the data-frame and the colors for each plot
x_data_frame<-as.data.frame(x_log)
ncol=ncol(x_data_frame)
nrow=nrow(x_data_frame)
# x_color contains color information for each sample
x_color<-vector(mode="character", length=ncol)

for (i in 1:ncol){
  treat<-anno[i,1]
  tech<-anno[i,2]
  if (treat=="DMSO"&& tech=="bulk"){
    x_color[i]<-"DMSO+bulk"
  }
  
  else if (treat=="TSA"&& tech=="bulk"){
    x_color[i]<-"TSA+bulk"
  }
  
  else if (treat=="DMSO"&& tech=="IFC96"){
    x_color[i]<-"DMSO+IFC96"
  }
  
  else if (treat=="TSA"&& tech=="IFC96"){
    x_color[i]<-"TSA+IFC96"
  }
  
  else if (treat=="DMSO"&& tech=="IFC800"){
    x_color[i]<-"DMSO+IFC800"
  }
  
  else if (treat=="TSA"&& tech=="IFC800"){
    x_color[i]<-"TSA+IFC800"
  }
  
  else{
    x_color[i]<-"unknown"
  }
  
}

Use violin plots to visualize the distribution of the expression values Group and color by experimental factors.

library(ggplot2)
#assign a new variable
flat_expressions<-x_log
#setting dimension of flat_X to NULL means it transforms to a single vector. (flattening the gene expression into one long vector)
dim(flat_expressions)<-NULL
# Create a vector of sample names, to be used as factor
# If you use "each" parameter it works like this:1 is repeated nrow times, 2 repeated nrow times,......6 repeated nrow times etc until 86
flat_names<-rep(1:ncol, each=nrow)
flat_colors<-rep(x_color, each=nrow)

# Create the flat data frame. put those three columns into one dataframe.
# x_flat_data_frame has three columns. 1st col=flat_names(sample name) 2nd col=flat_expression(gene expression)  
# 3rd col=flat_colors(color)
x_flat_data_frame<-data.frame(flat_expressions, flat_names, flat_colors)
names(x_flat_data_frame)<-c("gene_expression", "sample_name", "color")

# Create a simple violin plot
# aes defines what you should have in x and y.
# x is sample name. you have to transform 1~86 to factor.
# take all y values that correspond to same x and make 1 violin.
p<-ggplot(x_flat_data_frame, aes(x=factor(x_flat_data_frame$sample_name), y=x_flat_data_frame$gene_expression, fill=x_flat_data_frame$color))
p+geom_violin()

To make violin plot more visible, I will just use first few samples.

flat_expressions_reduced<-x_log[,1:6]

dim(flat_expressions_reduced)<-NULL
flat_names_reduced<-rep(1:6, each=nrow)
flat_colors_reduced<-rep(x_color[1:6], each=nrow)

x_flat_df_reduced<-data.frame(flat_expressions_reduced, flat_names_reduced, flat_colors_reduced)
names(x_flat_df_reduced)<-c("gene_expression", "sample_name", "color")

p<-ggplot(x_flat_df_reduced, aes(x=factor(x_flat_df_reduced$sample_name), y=x_flat_df_reduced$gene_expression, fill=x_flat_df_reduced$color))
p+geom_violin()

use boxplots to visualize the distribution of the expression values Group and color by experimental factors.

# Prepare the data for the boxplot

#x_at for box plot. it tells me at which line exactly a particular box plot should be drawn.
#x_at is a vector 1~86

x_at = 1:ncol
x_names = paste("s", 1:ncol)
# Box plot
boxplot(x_data_frame,
        main="gene expression",
        at=x_at,
        # at is defining position fo plot appears
        xlab="normalized expression",
        names=x_names,
        las=2,
        border=factor(x_color),
        horizontal=TRUE)

To make box plot more visible, I will just use first few samples.

# Box plot
#x_data_frame[1:6] this selects first 6 columns(variables)
boxplot(x_data_frame[1:15],
        
        main = "Gene expression",
        
        #at is defining position fo plot appears
        # 1st plot appeas 1st position, 2nd plot appears in 4th position......
        at = c(1,4,2,5,3,6,8,9,7,11,10,12,14,13,15),
        xlab = "normalized expression",
        names = x_names[1:15],
        las = 2,
        border = factor(x_color[1:15]),
        horizontal = TRUE
)

Most variable genes

Identify the 500 most variable genes (with largest variance across samples) and continue working with those.

# Calculate the indexes of x, ordered according to the variance
# apply is a function that applies a certain function to either all columns or all rows of the dataframe.
# 1 means->apply to the rows
# 2 means_> apply to columns
# function->var is the function you want to apply
# calculates variance of each row
x_variance<-apply(x_data_frame, 1, var)
# order returns the vector of indices of ordered vector.
# na.last=T means if they are not assigned values they end up beind at the end
# decreasing=T from biggest to smallest
row_indexes<-order(x_variance, na.last=TRUE, decreasing=TRUE)
#head(row_indexes)
# Select the genes with the highest variance
# to select 500 points 
x_top_500<-head(x_data_frame[row_indexes,],500)
head(x_top_500)
##         bulk.DMSO-Pre-96IFC bulk.TSA-Pre-96IFC bulk.DMSO-Pre-800IFC
## EEF1A1             8.014777           7.437697             8.031036
## NUDT19             3.256398           3.483093             3.235704
## RRM2               6.235273           5.406863             6.200634
## TSPYL1             4.095684           5.013038             4.113957
## SLC35F2            4.369765           3.891466             4.268217
## RAP2B              4.615777           5.694864             4.761932
##         bulk.TSA-Pre-800IFC bulk.DMSO-Post-800IFC bulk.TSA-Post-800IFC
## EEF1A1             7.437673              7.798348             7.568173
## NUDT19             3.043603              3.268930             2.914378
## RRM2               4.991755              6.153708             4.997477
## TSPYL1             4.896899              4.003042             4.901664
## SLC35F2            3.665405              4.199482             3.793348
## RAP2B              5.569699              4.700821             5.565621
##         ifc96.LIB018588_GEN00046618_S78_L001
## EEF1A1                              8.138783
## NUDT19                              5.310767
## RRM2                                0.000000
## TSPYL1                              6.697922
## SLC35F2                             5.776628
## RAP2B                               7.298507
##         ifc96.LIB018588_GEN00046619_S79_L001
## EEF1A1                              7.937117
## NUDT19                              6.191110
## RRM2                                6.953878
## TSPYL1                              6.963859
## SLC35F2                             6.277592
## RAP2B                               7.693593
##         ifc96.LIB018588_GEN00046620_S80_L001
## EEF1A1                              6.174591
## NUDT19                              6.174591
## RRM2                                0.000000
## TSPYL1                              0.000000
## SLC35F2                             0.000000
## RAP2B                               0.000000
##         ifc96.LIB018588_GEN00046621_S81_L001
## EEF1A1                              8.171921
## NUDT19                              6.380830
## RRM2                                0.000000
## TSPYL1                              6.800486
## SLC35F2                             6.057429
## RAP2B                               7.884581
##         ifc96.LIB018588_GEN00046622_S82_L001
## EEF1A1                              6.808619
## NUDT19                              5.139416
## RRM2                                0.000000
## TSPYL1                              0.000000
## SLC35F2                             0.000000
## RAP2B                               4.052458
##         ifc96.LIB018588_GEN00046623_S83_L001
## EEF1A1                              7.916533
## NUDT19                              7.164232
## RRM2                                7.111143
## TSPYL1                              6.374779
## SLC35F2                             6.081471
## RAP2B                               7.852169
##         ifc96.LIB018588_GEN00046624_S84_L001
## EEF1A1                              7.954351
## NUDT19                              7.370651
## RRM2                                5.408860
## TSPYL1                              8.344757
## SLC35F2                             6.260206
## RAP2B                               7.549062
##         ifc96.LIB018588_GEN00046625_S85_L001
## EEF1A1                              7.891281
## NUDT19                              6.327853
## RRM2                                0.000000
## TSPYL1                              6.594290
## SLC35F2                             5.986242
## RAP2B                               7.553178
##         ifc96.LIB018588_GEN00046626_S86_L001
## EEF1A1                              0.000000
## NUDT19                              0.000000
## RRM2                                3.193615
## TSPYL1                              0.000000
## SLC35F2                             0.000000
## RAP2B                               3.369076
##         ifc96.LIB018588_GEN00046627_S87_L001
## EEF1A1                              7.771700
## NUDT19                              7.026673
## RRM2                                0.000000
## TSPYL1                              6.928288
## SLC35F2                             6.739637
## RAP2B                               6.980345
##         ifc96.LIB018588_GEN00046628_S88_L001
## EEF1A1                              4.958238
## NUDT19                              5.361359
## RRM2                                0.000000
## TSPYL1                              4.272091
## SLC35F2                             0.000000
## RAP2B                               4.958238
##         ifc96.LIB018588_GEN00046629_S89_L001
## EEF1A1                              7.952244
## NUDT19                              5.888708
## RRM2                                0.000000
## TSPYL1                              5.391508
## SLC35F2                             5.601949
## RAP2B                               7.149635
##         ifc96.LIB018588_GEN00046630_S90_L001
## EEF1A1                              7.914671
## NUDT19                              6.728051
## RRM2                                4.249541
## TSPYL1                              6.865012
## SLC35F2                             6.596617
## RAP2B                               7.071628
##         ifc96.LIB018588_GEN00046631_S91_L001
## EEF1A1                              8.316330
## NUDT19                              6.631053
## RRM2                                3.377311
## TSPYL1                              6.519324
## SLC35F2                             6.283606
## RAP2B                               7.748913
##         ifc96.LIB018588_GEN00046632_S92_L001
## EEF1A1                              7.654020
## NUDT19                              6.544538
## RRM2                                4.283393
## TSPYL1                              6.497287
## SLC35F2                             6.193856
## RAP2B                               7.597825
##         ifc96.LIB018588_GEN00046633_S93_L001
## EEF1A1                              8.026738
## NUDT19                              6.200969
## RRM2                                0.000000
## TSPYL1                              6.156904
## SLC35F2                             6.574355
## RAP2B                               7.836884
##         ifc96.LIB018588_GEN00046634_S94_L001
## EEF1A1                              7.860224
## NUDT19                              6.424661
## RRM2                                5.794096
## TSPYL1                              6.346750
## SLC35F2                             6.233632
## RAP2B                               7.279995
##         ifc96.LIB018588_GEN00046635_S95_L001
## EEF1A1                              7.977083
## NUDT19                              6.193714
## RRM2                                6.503324
## TSPYL1                              6.623600
## SLC35F2                             6.141573
## RAP2B                               7.150759
##         ifc96.LIB018588_GEN00046636_S96_L001
## EEF1A1                              7.249692
## NUDT19                              6.820064
## RRM2                                0.000000
## TSPYL1                              6.494089
## SLC35F2                             7.061093
## RAP2B                               7.720114
##         ifc96.LIB018589_GEN00046637_S97_L002
## EEF1A1                              8.381627
## NUDT19                              6.035540
## RRM2                                5.828453
## TSPYL1                              5.488037
## SLC35F2                             5.964808
## RAP2B                               6.357653
##         ifc96.LIB018589_GEN00046638_S98_L002
## EEF1A1                              8.110863
## NUDT19                              6.879199
## RRM2                                6.956217
## TSPYL1                              6.284805
## SLC35F2                             6.265127
## RAP2B                               6.585469
##         ifc96.LIB018589_GEN00046639_S99_L002
## EEF1A1                              8.105260
## NUDT19                              7.041037
## RRM2                                6.095102
## TSPYL1                              6.409140
## SLC35F2                             6.178175
## RAP2B                               6.331185
##         ifc96.LIB018589_GEN00046640_S100_L002
## EEF1A1                               8.535092
## NUDT19                               4.742636
## RRM2                                 6.685059
## TSPYL1                               5.431416
## SLC35F2                              6.269156
## RAP2B                                6.420866
##         ifc96.LIB018589_GEN00046641_S101_L002
## EEF1A1                               8.147820
## NUDT19                               7.263454
## RRM2                                 6.921044
## TSPYL1                               5.879259
## SLC35F2                              6.109100
## RAP2B                                6.965006
##         ifc96.LIB018589_GEN00046642_S102_L002
## EEF1A1                               7.997022
## NUDT19                               6.930602
## RRM2                                 7.006324
## TSPYL1                               5.681586
## SLC35F2                              6.590318
## RAP2B                                6.582921
##         ifc96.LIB018589_GEN00046643_S103_L002
## EEF1A1                               8.283312
## NUDT19                               6.029990
## RRM2                                 6.390274
## TSPYL1                               5.324717
## SLC35F2                              6.472512
## RAP2B                                6.640935
##         ifc96.LIB018589_GEN00046644_S104_L002
## EEF1A1                               8.290907
## NUDT19                               6.127205
## RRM2                                 0.000000
## TSPYL1                               6.053837
## SLC35F2                              0.000000
## RAP2B                                7.016507
##         ifc96.LIB018589_GEN00046645_S105_L002
## EEF1A1                               8.344282
## NUDT19                               7.234195
## RRM2                                 6.876042
## TSPYL1                               5.750232
## SLC35F2                              6.770949
## RAP2B                                6.251682
##         ifc96.LIB018589_GEN00046646_S106_L002
## EEF1A1                               8.389450
## NUDT19                               5.750171
## RRM2                                 0.000000
## TSPYL1                               6.123174
## SLC35F2                              6.368597
## RAP2B                                6.607249
##         ifc96.LIB018589_GEN00046647_S107_L002
## EEF1A1                               8.353071
## NUDT19                               6.916726
## RRM2                                 1.022788
## TSPYL1                               6.347214
## SLC35F2                              5.456614
## RAP2B                                6.628197
##         ifc96.LIB018589_GEN00046648_S108_L002
## EEF1A1                               8.589233
## NUDT19                               6.547093
## RRM2                                 0.000000
## TSPYL1                               6.071801
## SLC35F2                              6.560633
## RAP2B                                7.404233
##         ifc96.LIB018589_GEN00046649_S109_L002
## EEF1A1                               8.195026
## NUDT19                               5.048757
## RRM2                                 7.864826
## TSPYL1                               5.807469
## SLC35F2                              6.575191
## RAP2B                                5.358975
##         ifc96.LIB018589_GEN00046650_S110_L002
## EEF1A1                               7.954024
## NUDT19                               6.398260
## RRM2                                 0.000000
## TSPYL1                               5.788114
## SLC35F2                              6.752006
## RAP2B                                6.582323
##         ifc96.LIB018589_GEN00046651_S111_L002
## EEF1A1                              8.5920617
## NUDT19                              6.4603136
## RRM2                                0.7674101
## TSPYL1                              5.5087417
## SLC35F2                             6.4870358
## RAP2B                               6.5384287
##         ifc96.LIB018589_GEN00046652_S112_L002
## EEF1A1                               8.320613
## NUDT19                               6.767691
## RRM2                                 7.059247
## TSPYL1                               5.799263
## SLC35F2                              6.664124
## RAP2B                                5.707324
##         ifc96.LIB018589_GEN00046653_S113_L002
## EEF1A1                               8.484218
## NUDT19                               5.719387
## RRM2                                 3.452177
## TSPYL1                               5.437099
## SLC35F2                              5.952183
## RAP2B                                6.705031
##         ifc96.LIB018589_GEN00046654_S114_L002
## EEF1A1                               8.324573
## NUDT19                               6.037365
## RRM2                                 0.000000
## TSPYL1                               5.218118
## SLC35F2                              6.558652
## RAP2B                                7.040520
##         ifc96.LIB018589_GEN00046655_S115_L002
## EEF1A1                               8.217502
## NUDT19                               5.096100
## RRM2                                 6.581880
## TSPYL1                               5.282592
## SLC35F2                              5.921842
## RAP2B                                6.359082
##         ifc96.LIB018589_GEN00046656_S116_L002
## EEF1A1                               8.933926
## NUDT19                               5.026772
## RRM2                                 1.944360
## TSPYL1                               4.835620
## SLC35F2                              5.702456
## RAP2B                                6.333502
##         ifc96.LIB018588_GEN00046541_S1_L001
## EEF1A1                             7.969540
## NUDT19                             6.288428
## RRM2                               6.517274
## TSPYL1                             6.626784
## SLC35F2                            6.438923
## RAP2B                              8.068990
##         ifc800.COL03_ROW01_LIB018723-GEN00047367
## EEF1A1                                  0.000000
## NUDT19                                  0.000000
## RRM2                                    6.002152
## TSPYL1                                  0.000000
## SLC35F2                                 2.906037
## RAP2B                                   3.721583
##         ifc800.COL03_ROW02_LIB018723-GEN00047367
## EEF1A1                                  0.000000
## NUDT19                                  0.000000
## RRM2                                    5.530829
## TSPYL1                                  2.180958
## SLC35F2                                 3.478748
## RAP2B                                   4.635817
##         ifc800.COL03_ROW03_LIB018723-GEN00047367
## EEF1A1                                  0.000000
## NUDT19                                  0.000000
## RRM2                                    6.863288
## TSPYL1                                  4.833351
## SLC35F2                                 0.000000
## RAP2B                                   0.000000
##         ifc800.COL03_ROW04_LIB018723-GEN00047367
## EEF1A1                                  2.700594
## NUDT19                                  0.000000
## RRM2                                    6.378340
## TSPYL1                                  0.000000
## SLC35F2                                 0.000000
## RAP2B                                   0.000000
##         ifc800.COL03_ROW05_LIB018723-GEN00047367
## EEF1A1                                  1.842936
## NUDT19                                  0.000000
## RRM2                                    5.963576
## TSPYL1                                  1.842936
## SLC35F2                                 0.000000
## RAP2B                                   0.000000
##         ifc800.COL03_ROW06_LIB018723-GEN00047367
## EEF1A1                                  0.000000
## NUDT19                                  4.455511
## RRM2                                    6.842792
## TSPYL1                                  0.000000
## SLC35F2                                 0.000000
## RAP2B                                   0.000000
##         ifc800.COL03_ROW07_LIB018723-GEN00047367
## EEF1A1                                  0.000000
## NUDT19                                  0.000000
## RRM2                                    6.111146
## TSPYL1                                  3.572464
## SLC35F2                                 0.000000
## RAP2B                                   3.356318
##         ifc800.COL03_ROW08_LIB018723-GEN00047367
## EEF1A1                                  2.390713
## NUDT19                                  0.000000
## RRM2                                    7.099528
## TSPYL1                                  0.000000
## SLC35F2                                 3.924078
## RAP2B                                   2.390713
##         ifc800.COL03_ROW09_LIB018723-GEN00047367
## EEF1A1                                  0.000000
## NUDT19                                  0.000000
## RRM2                                    6.313580
## TSPYL1                                  0.000000
## SLC35F2                                 0.000000
## RAP2B                                   1.808619
##         ifc800.COL03_ROW10_LIB018723-GEN00047367
## EEF1A1                                  0.000000
## NUDT19                                  0.000000
## RRM2                                    6.357654
## TSPYL1                                  0.000000
## SLC35F2                                 0.000000
## RAP2B                                   2.010987
##         ifc800.COL03_ROW11_LIB018723-GEN00047367
## EEF1A1                                  0.000000
## NUDT19                                  0.000000
## RRM2                                    4.468720
## TSPYL1                                  0.000000
## SLC35F2                                 2.466456
## RAP2B                                   0.000000
##         ifc800.COL03_ROW12_LIB018723-GEN00047367
## EEF1A1                                  4.321119
## NUDT19                                  0.000000
## RRM2                                    5.932262
## TSPYL1                                  0.000000
## SLC35F2                                 3.511377
## RAP2B                                   2.973906
##         ifc800.COL03_ROW13_LIB018723-GEN00047367
## EEF1A1                                  0.000000
## NUDT19                                  0.000000
## RRM2                                    6.179237
## TSPYL1                                  0.000000
## SLC35F2                                 0.000000
## RAP2B                                   4.931641
##         ifc800.COL03_ROW14_LIB018723-GEN00047367
## EEF1A1                                  3.698217
## NUDT19                                  0.000000
## RRM2                                    7.041309
## TSPYL1                                  0.000000
## SLC35F2                                 0.000000
## RAP2B                                   2.647952
##         ifc800.COL03_ROW15_LIB018723-GEN00047367
## EEF1A1                                  2.511544
## NUDT19                                  0.000000
## RRM2                                    6.888084
## TSPYL1                                  3.950430
## SLC35F2                                 0.000000
## RAP2B                                   0.000000
##         ifc800.COL03_ROW16_LIB018723-GEN00047367
## EEF1A1                                  0.000000
## NUDT19                                  0.000000
## RRM2                                    5.350289
## TSPYL1                                  0.000000
## SLC35F2                                 2.916280
## RAP2B                                   0.000000
##         ifc800.COL03_ROW17_LIB018723-GEN00047367
## EEF1A1                                  1.977966
## NUDT19                                  0.000000
## RRM2                                    5.982625
## TSPYL1                                  2.979810
## SLC35F2                                 1.414399
## RAP2B                                   1.414399
##         ifc800.COL03_ROW18_LIB018723-GEN00047367
## EEF1A1                                  0.000000
## NUDT19                                  0.000000
## RRM2                                    6.432380
## TSPYL1                                  0.000000
## SLC35F2                                 3.791014
## RAP2B                                   0.000000
##         ifc800.COL03_ROW19_LIB018723-GEN00047367
## EEF1A1                                  0.000000
## NUDT19                                  0.000000
## RRM2                                    4.706882
## TSPYL1                                  0.000000
## SLC35F2                                 0.000000
## RAP2B                                   0.000000
##         ifc800.COL03_ROW20_LIB018723-GEN00047367
## EEF1A1                                  3.445719
## NUDT19                                  2.783956
## RRM2                                    6.561039
## TSPYL1                                  0.000000
## SLC35F2                                 0.000000
## RAP2B                                   0.000000
##         ifc800.COL11_ROW01_LIB018723-GEN00047362
## EEF1A1                                  0.000000
## NUDT19                                  0.000000
## RRM2                                    5.688547
## TSPYL1                                  5.688547
## SLC35F2                                 0.000000
## RAP2B                                   0.000000
##         ifc800.COL11_ROW02_LIB018723-GEN00047362
## EEF1A1                                         0
## NUDT19                                         0
## RRM2                                           0
## TSPYL1                                         0
## SLC35F2                                        0
## RAP2B                                          0
##         ifc800.COL11_ROW03_LIB018723-GEN00047362
## EEF1A1                                  2.913392
## NUDT19                                  0.000000
## RRM2                                    0.000000
## TSPYL1                                  3.579019
## SLC35F2                                 4.478426
## RAP2B                                   0.000000
##         ifc800.COL11_ROW04_LIB018723-GEN00047362
## EEF1A1                                   0.00000
## NUDT19                                   0.00000
## RRM2                                     0.00000
## TSPYL1                                   5.55451
## SLC35F2                                  0.00000
## RAP2B                                    0.00000
##         ifc800.COL11_ROW05_LIB018723-GEN00047362
## EEF1A1                                  0.000000
## NUDT19                                  0.000000
## RRM2                                    5.449964
## TSPYL1                                  0.000000
## SLC35F2                                 0.000000
## RAP2B                                   0.000000
##         ifc800.COL11_ROW06_LIB018723-GEN00047362
## EEF1A1                                         0
## NUDT19                                         0
## RRM2                                           0
## TSPYL1                                         0
## SLC35F2                                        0
## RAP2B                                          0
##         ifc800.COL11_ROW07_LIB018723-GEN00047362
## EEF1A1                                  4.285605
## NUDT19                                  1.562528
## RRM2                                    6.234542
## TSPYL1                                  2.144964
## SLC35F2                                 2.510623
## RAP2B                                   2.777792
##         ifc800.COL11_ROW08_LIB018723-GEN00047362
## EEF1A1                                  3.704843
## NUDT19                                  4.687710
## RRM2                                    5.322433
## TSPYL1                                  0.000000
## SLC35F2                                 0.000000
## RAP2B                                   1.896627
##         ifc800.COL11_ROW09_LIB018723-GEN00047362
## EEF1A1                                  4.080012
## NUDT19                                  1.765674
## RRM2                                    5.930990
## TSPYL1                                  0.000000
## SLC35F2                                 0.000000
## RAP2B                                   3.797949
##         ifc800.COL11_ROW10_LIB018723-GEN00047362
## EEF1A1                                  3.294549
## NUDT19                                  0.000000
## RRM2                                    0.000000
## TSPYL1                                  0.000000
## SLC35F2                                 3.572917
## RAP2B                                   2.907457
##         ifc800.COL11_ROW11_LIB018723-GEN00047362
## EEF1A1                                  4.330370
## NUDT19                                  0.000000
## RRM2                                    6.653784
## TSPYL1                                  0.000000
## SLC35F2                                 4.330370
## RAP2B                                   5.170117
##         ifc800.COL11_ROW12_LIB018723-GEN00047362
## EEF1A1                                  4.557685
## NUDT19                                  3.527150
## RRM2                                    4.273492
## TSPYL1                                  1.422003
## SLC35F2                                 1.422003
## RAP2B                                   3.479829
##         ifc800.COL11_ROW13_LIB018723-GEN00047362
## EEF1A1                                  4.068970
## NUDT19                                  3.044468
## RRM2                                    6.241193
## TSPYL1                                  0.000000
## SLC35F2                                 0.000000
## RAP2B                                   4.590000
##         ifc800.COL11_ROW14_LIB018723-GEN00047362
## EEF1A1                                  3.585646
## NUDT19                                  0.000000
## RRM2                                    0.000000
## TSPYL1                                  4.485165
## SLC35F2                                 0.000000
## RAP2B                                   4.133311
##         ifc800.COL11_ROW15_LIB018723-GEN00047362
## EEF1A1                                  4.755991
## NUDT19                                  3.317918
## RRM2                                    0.000000
## TSPYL1                                  0.000000
## SLC35F2                                 3.814146
## RAP2B                                   3.317918
##         ifc800.COL11_ROW16_LIB018723-GEN00047362
## EEF1A1                                         0
## NUDT19                                         0
## RRM2                                           0
## TSPYL1                                         0
## SLC35F2                                        0
## RAP2B                                          0
##         ifc800.COL11_ROW17_LIB018723-GEN00047362
## EEF1A1                                  3.963772
## NUDT19                                  0.000000
## RRM2                                    3.289438
## TSPYL1                                  0.000000
## SLC35F2                                 0.000000
## RAP2B                                   4.515215
##         ifc800.COL11_ROW18_LIB018723-GEN00047362
## EEF1A1                                  5.280874
## NUDT19                                  0.000000
## RRM2                                    5.031013
## TSPYL1                                  0.000000
## SLC35F2                                 0.000000
## RAP2B                                   3.514237
##         ifc800.COL11_ROW19_LIB018723-GEN00047362
## EEF1A1                                  4.909205
## NUDT19                                  2.848511
## RRM2                                    1.692879
## TSPYL1                                  0.000000
## SLC35F2                                 2.930683
## RAP2B                                   3.204935
##         ifc800.COL11_ROW20_LIB018723-GEN00047362
## EEF1A1                                  4.854672
## NUDT19                                  0.000000
## RRM2                                    5.177928
## TSPYL1                                  3.911551
## SLC35F2                                 0.000000
## RAP2B                                   3.238215

Sample correlations

Compute and visualize the sample-to-sample correlations

#prepare the data
x_top_500_corr<-x_top_500

#name sample so that it can output easily
#names is gonna name variables (in the dataframe it is column)
names(x_top_500_corr)<-1:ncol

# Compute the correlation between columns
#cor calculates correlation between variables of the data frame.(upper traingle)
correlation<-cor(x_top_500_corr)

# Visualize it using corrplot
# install.packages("corrplot")
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.3
## corrplot 0.84 loaded
# plot correlation. used default setting
# sample against sample
# sample is ordered by hclust
corrplot(correlation, type="upper", order="hclust", tl.col="black", tl.srt=45)

Clustering

Compute and visualize a hierarchical clustering of the samples, use package hclust

#prepare data
x_top_500_cluster<-x_top_500
#hclust uses dissimilarity structure.
#dist(t(x_top_500)) t is transposition. dist computes distance between rows. you want distance between samples. so had to transpose so that samples appears in the rows.
#p is power p=2 is euclidean

names(x_top_500_cluster)<-1:ncol
#compute distance matrix
dissimilarity_matrix<-dist(t(x_top_500_cluster), method="euclidean", diag=FALSE, upper=FALSE, p=2)
# Create and plot the cluster
cluster<-hclust(dissimilarity_matrix, method="complete", members=NULL)
plot(cluster)

Heatmap

Use the package pheatmap to generate a heatmap of the expression data.

# install.packages(pheatmap)
library(pheatmap)
## Warning: package 'pheatmap' was built under R version 3.6.3
#prepare the data
x_top_500_heat<-x_top_500
#names know that it will give names to columns(variables)
names(x_top_500_heat)<-1:ncol
row.names(x_top_500_heat)<-1:500
#computes heat map of dataframe (relationship between different samples) this shows hclust and heatmap
#x-axis is sample and y-axis gene (observation).
pheatmap(x_top_500_heat)

PCA

In the exercise session, we saw a potential case where the normal PCA can be misleading.

#constants to control the points
num_points=200
set.seed(6)

# generate the center points with 3 dimensions
# runif generates a vector of random numbers between 0 and 1

center_angles<-runif(num_points)*2*pi
center_x<- 5* runif(num_points)*cos(center_angles)
center_y<-3* runif(num_points)*sin(center_angles)
center_z<-1* runif(num_points)
center_class<-rep(1,times=num_points)


# generate the border points with 3 dimensions

border_angles<-runif(num_points)*2*pi
border_x<-(10+5*runif(num_points))*cos(border_angles)
border_y<-(7+3*runif(num_points))*sin(border_angles)
border_z<-(4+1*runif(num_points))*cos(border_angles)
border_class<-rep(2, times=num_points)
# generate the full data frame.
# pca_data have 3 columns. first half of 2nd column is center_class_x, 2nd half of 2nd column is border class_x
data<-data.frame(c(center_x, border_x), c(center_y, border_y), c(center_z, border_z), c(center_class, border_class))
names(data)<-c("x","y","z", "class")

# Plot the data, coloring them based on the class
# color is a list of 2elments but indexed by by class column of data.
plot(data$x,data$y, col=c("red","blue")[data$class])

* Do the PCA, plot the variance explained by the principal components. Select \(k\) such that you explain \(80\%\) of the variance in your data.

#compute and view the pareto plot of the pca
pca<-prcomp(data, center=TRUE, scale=TRUE)
summary(pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4
## Standard deviation     1.3984 1.0159 0.9852 0.2050
## Proportion of Variance 0.4889 0.2580 0.2426 0.0105
## Cumulative Proportion  0.4889 0.7469 0.9895 1.0000
#calculates standard deviation associated every single principal component
#pca.var contains variance explained by each principal component
pca.var<-pca$sdev^2
qcc::pareto.chart(pca.var)

##    
## Pareto chart analysis for pca.var
##        Frequency    Cum.Freq.   Percentage Cum.Percent.
##   A   1.95543374   1.95543374  48.88584354  48.88584354
##   B   1.03200555   2.98743929  25.80013882  74.68598236
##   C   0.97054084   3.95798013  24.26352100  98.94950336
##   D   0.04201987   4.00000000   1.05049664 100.00000000
#in this case you need to choose 3 out of 4 principal components to explain over 80%

Select \(k\) such that you explain \(80\%\) of the variance in your data.

#select k=3 to explain at least 80% of the variance
k=3
# projection onto the lower dimension(=k dimension)
# scale function does "mean centering" and "multiplies each column by certain scale"
# you scale data such that pca$center is the mean of each column and every column is multiplied by this pca$scale
# pca$rotation are eigen vectors and then you selected first k columns (k eigen vectors)
# %*% makes matrix multiplication. 
# data is the original data.this is not centered and scaled.
# but pca is done in mean centered and scaled data. so you need to scale data again
projected_points<- as.data.frame (scale(data, pca$center, pca$scale)%*%pca$rotation[,1:k])
#add additional column (class)
projected_points$class<-data$class
# v1 projection of all the points along the first principal components
names(projected_points)<-c("v1", "v2", "v3", "class")
head(projected_points)
##            v1         v2         v3 class
## 1 -0.04417439 -1.0328398 -0.3740435     1
## 2 -0.52765379 -0.7389911 -0.5756853     1
## 3 -0.05238098 -0.6646648 -0.7585589     1
## 4 -0.07567400 -0.6099183 -0.8473754     1
## 5 -0.04369685 -0.9555762 -0.4158375     1
## 6 -0.14511580 -0.7979205 -0.5896901     1
#col=c("red","green")[pca_projected$class] if class=1 then this outputs red. if class=2 outputs green 
plot(projected_points[,1], projected_points[,2], col=c("red", "blue")[projected_points$class])

problem: Before pca data was well separated by class,but after pca, the data is less separable even if you used prinipal components that explained above 80% variance (98%).This problem happened because this data is not linearly separable but pca works in the data that is linearly separable. So we use kernel to tackle this issue

#pca works in the data that is linearly separable. So use kernel
#data is original data. using 3rd column (z axis), 4th column(class). you are not changing these two columns

kernelized_data<-as.data.frame(data[c(3:4)])

#generated two columns ( r and a )
#r= reduced (distance of each point from origin)
kernelized_data$r<-(data$x^2+data$y^2)^0.5

#a=angle of points (angle or rotation between x-axis and points)
#atan=arc tan=inverse of tangent=gives you this angle
kernelized_data$a<-atan(data$y/data$x)
head(kernelized_data)
##            z class         r           a
## 1 0.66135785     1 1.8439048  1.19259807
## 2 0.30567014     1 4.5972491 -0.01924905
## 3 0.48402936     1 0.7119307 -1.22399216
## 4 0.93482881     1 1.7824262 -0.68722690
## 5 0.09972553     1 1.6322270 -0.96448391
## 6 0.39079038     1 0.9642127 -0.29105009
#you have transformed data (kernelized data) into non-linear space so that when you use pca later it is linearly separable.
plot(kernelized_data$r, kernelized_data$a, col=c("red", "blue")[kernelized_data$class])

kernel_pca<-prcomp(kernelized_data, center=TRUE, scale=TRUE)
kernel_pca_var<-kernel_pca$sdev^2
qcc::pareto.chart(kernel_pca_var)

##    
## Pareto chart analysis for kernel_pca_var
##        Frequency    Cum.Freq.   Percentage Cum.Percent.
##   A   1.95572526   1.95572526  48.89313151  48.89313151
##   B   1.01012551   2.96585077  25.25313777  74.14626927
##   C   0.98010736   3.94595813  24.50268390  98.64895318
##   D   0.05404187   4.00000000   1.35104682 100.00000000
#select k=3 to explain at least 80% of the variance
k=3

projected_kernel_points<-as.data.frame(scale(kernelized_data, center=kernel_pca$center,scale=kernel_pca$scale)%*%kernel_pca$rotation[,1:k])
projected_kernel_points$class<-kernelized_data$class
names(projected_kernel_points)<-c("V1", "V2", "V3", "class")

plot(projected_kernel_points$V1, projected_kernel_points$V2, col=c("red", "blue")[projected_kernel_points$class])

In this case no. Both methods had to use 3 out of 4 PCs to explain at least 80%. But even in this case, kernel pca can separate points according to class with k=1 or k=2. (example shown below) Also in my opinion using kernel pca can lead to smaller k.

#select k=1 to show kernel pca can separate points using even 1 pc
k=1
projected_kernel_points_reduced<-as.data.frame(scale(kernelized_data,  center=kernel_pca$center,scale=kernel_pca$scale)%*%kernel_pca$rotation[,1:k])

projected_kernel_points_reduced$class<-kernelized_data$class
names(projected_kernel_points_reduced)<-c("V1", "class")

#1:400 is an artificial axis. the only PC is y axis
plot(1:400, projected_kernel_points_reduced$V1, col=c("red", "blue")[projected_kernel_points$class])